started: Alexey Larionov, 30Jan2017
last updated: Alexey Larionov, 30Jan2017

Summary

  1. Compare eigenvectors calculated by R and by Eigenstrat PCA

start_section

# Time stamp
Sys.time()
## [1] "2017-01-30 20:05:00 GMT"
# Folders
setwd("/scratch/medgen/scripts/wecare_stat_01.17/scripts")
interim_data_folder <- "/scratch/medgen/scripts/wecare_stat_01.17/interim_data"

# Required libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

read_data

# --- r-calculated data --- #

load(paste(interim_data_folder, "r05_calculate_egenvectors_wecare_only_jan2017.RData", sep="/"))

rm(wecare_exac.df, wecare_genotypes.mx, wecare_kgen.df, wecare_nfe_exac.df, wecare_nfe_genotypes.mx, wecare_nfe_kgen.df, wecare_nfe_phenotypes.df, wecare_nfe_variants.df, wecare_phenotypes.df, wecare_variants.df)

# eigenvectors
r_evectors.df <- as.data.frame(wecare_eigen$vectors[,1:10])

colnames(r_evectors.df) <- c("r_ev1", "r_ev2", "r_ev3", "r_ev4", "r_ev5", "r_ev6", "r_ev7", "r_ev8", "r_ev9", "r_ev10")
str(r_evectors.df)
## 'data.frame':    480 obs. of  10 variables:
##  $ r_ev1 : num  0.00878 0.00978 -0.0044 -0.00765 -0.00231 ...
##  $ r_ev2 : num  -0.05413 -0.02963 0.00248 0.00195 0.00781 ...
##  $ r_ev3 : num  0.010002 0.009256 0.000475 0.006905 0.00144 ...
##  $ r_ev4 : num  0.02115 0.00508 0.00383 0.00149 -0.00243 ...
##  $ r_ev5 : num  -0.184168 -0.092435 0.000648 0.003585 0.016433 ...
##  $ r_ev6 : num  0.024713 0.026999 0.001998 0.003521 0.000813 ...
##  $ r_ev7 : num  -0.02331 -0.01648 0.00636 0.00427 0.01293 ...
##  $ r_ev8 : num  -4.52e-02 3.42e-03 6.41e-03 -9.41e-04 9.08e-06 ...
##  $ r_ev9 : num  0.000288 -0.002909 -0.010841 0.008353 0.008717 ...
##  $ r_ev10: num  0.02773 -0.0363 0.00986 0.00429 0.00511 ...
# Common sence checks (using properties of eigenvectors)
apply(r_evectors.df,2,sum) # should be close to zeros
##         r_ev1         r_ev2         r_ev3         r_ev4         r_ev5 
## -1.398621e-16 -1.932672e-15  3.466357e-15 -1.507583e-16  8.426094e-15 
##         r_ev6         r_ev7         r_ev8         r_ev9        r_ev10 
## -3.342287e-15  1.714937e-16  1.890185e-15  1.548837e-15 -2.406604e-15
apply(r_evectors.df,2,function(x){sum(x^2)}) # should be close to ones
##  r_ev1  r_ev2  r_ev3  r_ev4  r_ev5  r_ev6  r_ev7  r_ev8  r_ev9 r_ev10 
##      1      1      1      1      1      1      1      1      1      1
# eigenvalues
r_evalues.df <- as.data.frame(wecare_eigen$values)
colnames(r_evalues.df) <- "evals"
plot(r_evalues.df$evals)

# --- es-calculated data --- #

es_evectors_file="/scratch/medgen/scripts/wecare_stat_01.17/interim_data/u01_wecare_only_480_226k_eigenstrat_no_outliers/u01_wecare_only_480_226k_eigenstrat_no_outliers.evec"

es_evectors.df <- read.table(es_evectors_file)

colnames(es_evectors.df) <- c("wes_id", "es_ev1", "es_ev2", "es_ev3", "es_ev4", "es_ev5", "es_ev6", "es_ev7", "es_ev8", "es_ev9", "es_ev10", "group")

str(es_evectors.df)
## 'data.frame':    480 obs. of  12 variables:
##  $ wes_id : Factor w/ 480 levels "P1_A01","P1_A02",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ es_ev1 : num  0.0157 0.0146 -0.0049 -0.0081 -0.002 -0.0064 -0.0087 -0.011 0.0283 -0.0077 ...
##  $ es_ev2 : num  -0.0558 -0.0315 0.0022 0.0014 0.0078 0.0066 0.0037 0.0073 -0.0578 0.0053 ...
##  $ es_ev3 : num  -0.0085 -0.0083 0.0002 -0.0072 -0.0017 -0.0026 -0.0034 -0.0008 -0.006 -0.0066 ...
##  $ es_ev4 : num  -0.0376 -0.0142 -0.0037 -0.0017 0.0035 0.0047 0.0029 0.0045 -0.0273 0.003 ...
##  $ es_ev5 : num  -0.1792 -0.0915 0.0005 0.0035 0.0154 ...
##  $ es_ev6 : num  -0.0214 -0.024 -0.0018 -0.004 -0.0011 0.0056 0.0021 -0.0024 -0.0151 -0.0041 ...
##  $ es_ev7 : num  0.0058 0.0172 -0.0024 -0.0043 -0.0132 0.0068 0.0044 -0.0011 0.0261 -0.0072 ...
##  $ es_ev8 : num  -0.0533 -0.0026 0.0073 0.0009 0.0053 0.0028 -0.0051 -0.0028 -0.028 0.0031 ...
##  $ es_ev9 : num  -0.0038 0.0026 0.0117 -0.0094 -0.0083 -0.0074 0.0059 -0.006 -0.0276 -0.0017 ...
##  $ es_ev10: num  0.0778 -0.0383 0.0068 -0.0006 0.0075 0 0.0016 -0.0029 0.0561 -0.0015 ...
##  $ group  : Factor w/ 2 levels "CBC","UBC": 1 1 1 1 1 1 2 1 1 2 ...
# Common sence checks (using properties of eigenvectors)
apply(es_evectors.df[,2:11],2,sum) # should be close to zeros
##  es_ev1  es_ev2  es_ev3  es_ev4  es_ev5  es_ev6  es_ev7  es_ev8  es_ev9 
## -0.0001  0.0003  0.0001 -0.0003 -0.0001  0.0010 -0.0004 -0.0008  0.0015 
## es_ev10 
##  0.0002
apply(es_evectors.df[,2:11],2,function(x){sum(x^2)}) # should be close to ones
##    es_ev1    es_ev2    es_ev3    es_ev4    es_ev5    es_ev6    es_ev7 
## 1.0000400 1.0000222 1.0000036 0.9999435 1.0000308 0.9999554 1.0000354 
##    es_ev8    es_ev9   es_ev10 
## 0.9998510 1.0000525 1.0000621
es_evalues_file <- "/scratch/medgen/scripts/wecare_stat_01.17/interim_data/u01_wecare_only_480_226k_eigenstrat_no_outliers/u01_wecare_only_480_226k_eigenstrat_no_outliers.eval"

es_evalues.df <- read.table(es_evalues_file)
colnames(es_evalues.df) <- "evals"
str(es_evalues.df)
## 'data.frame':    480 obs. of  1 variable:
##  $ evals: num  2.78 2.41 2.3 2.06 2.03 ...
plot(es_evalues.df$evals)

rm(es_evectors_file, es_evalues_file)

compare_evalues

plot(es_evalues.df$evals, r_evalues.df$evals, main="eigenvalues: R vs EIGENSTRAT")

abline(c(0,0),c(1,1), col="red")

compare_evectors

plot(es_evectors.df$es_ev1, r_evectors.df$r_ev1 , main="1st eigenvectors: R vs EIGENSTRAT")
abline(c(0,0),c(1,1), col="red")

# Prepare dataframe
joined_evectors.df <- cbind(es_evectors.df, r_evectors.df)

# Prepare colour scale
colours <- c("UBC" = "BLUE", "CBC" = "RED")
userColourScale <- scale_colour_manual(values=colours)

# --- 1st eigenvectors --- #

# Non-interactive plot
ggplot(joined_evectors.df, aes(es_ev1, r_ev1)) +
  geom_point(aes(colour=group, fill=group)) + 
  labs(title="1st eigenvectors: R vs EIGENSTRAT") +
  geom_abline(colour = "brown", linetype = 2) + 
  userColourScale

# Interactive plot
g <- ggplot(joined_evectors.df, aes(es_ev1, r_ev1)) +
  geom_point(aes(colour=group, fill=group, text=wes_id)) + 
  labs(title="1st eigenvectors: R vs EIGENSTRAT") +
  geom_abline(colour = "brown", linetype = 2) + 
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
# --- 2nd eigenvectors --- #

# Non-interactive plot
ggplot(joined_evectors.df, aes(es_ev2, r_ev2)) +
  geom_point(aes(colour=group, fill=group)) + 
  labs(title="2nd eigenvectors: R vs EIGENSTRAT") +
  geom_abline(colour = "brown", linetype = 2) + 
  userColourScale

# Interactive plot
g <- ggplot(joined_evectors.df, aes(es_ev2, r_ev2)) +
  geom_point(aes(colour=group, fill=group, text=wes_id)) + 
  labs(title="2nd eigenvectors: R vs EIGENSTRAT") +
  geom_abline(colour = "brown", linetype = 2) + 
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)
# --- 3rd eigenvectors --- #

# Non-interactive plot
ggplot(joined_evectors.df, aes(es_ev3, r_ev3)) +
  geom_point(aes(colour=group, fill=group)) + 
  labs(title="3rd eigenvectors: R vs EIGENSTRAT") +
  geom_abline(slope=-1, colour = "brown", linetype = 2) + 
  userColourScale

# Interactive plot
g <- ggplot(joined_evectors.df, aes(es_ev3, r_ev3)) +
  geom_point(aes(colour=group, fill=group, text=wes_id)) + 
  labs(title="3rd eigenvectors: R vs EIGENSTRAT") +
  geom_abline(slope = -1, colour = "brown", linetype = 2) + 
  userColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g)

save_data

rm(g)
save.image(paste(interim_data_folder, "r05_compare_egenvectors_wecare_only_jan2017.RData", sep="/"))

final_section

ls()
## [1] "colours"             "es_evalues.df"       "es_evectors.df"     
## [4] "interim_data_folder" "joined_evectors.df"  "r_evalues.df"       
## [7] "r_evectors.df"       "userColourScale"     "wecare_eigen"
sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Scientific Linux release 6.8 (Carbon)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] plotly_4.5.6  ggplot2_2.2.1 dplyr_0.5.0  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.6       knitr_1.13        magrittr_1.5     
##  [4] munsell_0.4.3     viridisLite_0.1.3 colorspace_1.2-6 
##  [7] R6_2.1.2          httr_1.2.1        stringr_1.0.0    
## [10] plyr_1.8.4        tools_3.2.3       grid_3.2.3       
## [13] gtable_0.2.0      DBI_0.5           htmltools_0.3.5  
## [16] yaml_2.1.13       lazyeval_0.2.0    assertthat_0.1   
## [19] digest_0.6.10     tibble_1.1        tidyr_0.5.1      
## [22] purrr_0.2.2       formatR_1.4       base64enc_0.1-3  
## [25] htmlwidgets_0.8   evaluate_0.9      rmarkdown_1.0    
## [28] labeling_0.3      stringi_1.1.1     scales_0.4.1     
## [31] jsonlite_1.0
Sys.time()
## [1] "2017-01-30 20:05:13 GMT"